View Javadoc

1   
2   /*
3    * SmartCrawler
4    *
5    * $Id: MultiThreadHttpCallRetriever.java,v 1.2 2005/08/05 15:55:53 vincool Exp $
6    * Copyright 2005 Davide Pozza
7    *
8    * This program is free software; you can redistribute it
9    * and/or modify it under the terms of the GNU General Public
10   * License as published by the Free Software Foundation;
11   * either version 2 of the License, or (at your option) any
12   * later version.
13   *
14   * This program is distributed in the hope that it will be
15   * useful, but WITHOUT ANY WARRANTY; without even the implied
16   * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17   * PURPOSE. See the GNU General Public License for more
18   * details.
19   *
20   * You should have received a copy of the GNU General Public
21   * License along with this program; if not, write to the Free
22   * Software Foundation, Inc., 59 Temple Place, Suite 330,
23   * Boston, MA 02111-1307 USA
24   *
25   */
26  
27  package org.smartcrawler.retriever;
28  
29  import java.io.ByteArrayOutputStream;
30  import java.io.IOException;
31  import java.io.InputStream;
32  import java.net.URL;
33  import org.apache.commons.httpclient.Header;
34  import org.apache.commons.httpclient.HostConfiguration;
35  import org.apache.commons.httpclient.HttpClient;
36  import org.apache.commons.httpclient.HttpConnectionManager;
37  import org.apache.commons.httpclient.HttpMethod;
38  import org.apache.commons.httpclient.HttpStatus;
39  import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
40  import org.apache.commons.httpclient.cookie.CookiePolicy;
41  import org.apache.commons.httpclient.methods.PostMethod;
42  import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
43  import org.apache.log4j.Logger;
44  import org.smartcrawler.common.AbstractParametrizableComponent;
45  import org.smartcrawler.common.Context;
46  import org.smartcrawler.common.Link;
47  import org.smartcrawler.common.MalformedLinkException;
48  import org.smartcrawler.common.SCLogger;
49  import org.smartcrawler.extractor.HtmlURL;
50  import org.smartcrawler.extractor.HtmlURLImpl;
51  import org.smartcrawler.extractor.LinkBuilderImpl;
52  
53  /***
54   *
55   *
56   * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
57   * @version <tt>$Revision: 1.2 $</tt>
58   */
59  public class MultiThreadHttpCallRetriever extends HttpCallRetriever implements Retriever {
60  
61      protected HttpClient httpClient;
62      private static Logger log = SCLogger.getLogger(MultiThreadHttpCallRetriever.class);
63  
64      /*** The max number of http connections per host. */
65      protected static final int DEFAULT_MAX_CONN_PER_HOST = 30;
66  
67      /*** The max number of http connections. */
68      protected static final int DEFAULT_MAX_TOTAL_CONN = 30;
69  
70      /*** The connection timeout. */
71      protected static final int DEFAULT_CONN_TIMEOUT = 10000;
72  
73      /*** The SO connection timeout. */
74      protected static final int DEFAULT_SO_TIMEOUT = 10000;
75  
76      /***
77       * Creates a new instance of HttpRetriever
78       * @param host
79       */
80      public MultiThreadHttpCallRetriever() {
81          this.httpClient = createHttpClient();
82          log.info("Created multiThread retriever");
83      }
84  
85      protected HttpClient getHttpClient() {
86          return this.httpClient;
87      }
88  
89      /***
90       * Method which creates the default httpClient
91       *
92       * @param isMultiThread
93       * @return
94       */
95      protected HttpClient createHttpClient() {
96          log.debug("createHttpClient: BEGIN");
97          //HttpClient httpClient = null;
98          HttpConnectionManager connMan = null;
99          connMan = new MultiThreadedHttpConnectionManager();
100         HttpConnectionManagerParams par = new HttpConnectionManagerParams();
101         par.setDefaultMaxConnectionsPerHost(DEFAULT_MAX_CONN_PER_HOST);
102         par.setMaxTotalConnections(DEFAULT_MAX_TOTAL_CONN);
103         par.setConnectionTimeout(DEFAULT_CONN_TIMEOUT);
104         par.setSoTimeout(DEFAULT_SO_TIMEOUT);
105         connMan.setParams(par);
106         log.debug("createHttpClient: END");
107         HttpClient client = new HttpClient(connMan);
108         client.getState().setCookiePolicy(CookiePolicy.COMPATIBILITY);
109         return client;
110     }
111 }